%load_ext nb_black
import pandas as pd
from datetime import timedelta, datetime
import plotly.express as px
from sklearn.linear_model import LinearRegression
import numpy as np
from plotly import graph_objects as go
# Load the OWID COVID dataset and keep only the rows for Russia.
data = pd.read_csv("owid-covid-data.csv")
data = data[data["location"] == "Russia"]
# Replace zero daily counts with 1 so np.log() can be applied later.
# Use .loc instead of chained indexing (data.new_cases[mask] = ...),
# which raises SettingWithCopyWarning and may silently not write through.
data.loc[data.new_cases == 0, "new_cases"] = 1
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
data.index = data["date"]
data.drop(columns=["date"], inplace=True)
# Training window: 50 days starting from the first date of interest.
start_date = datetime(year=2020, month=3, day=3)
stop_train_date = start_date + timedelta(days=50)
data = data[(data.index >= start_date)]
# `target` holds the two modelled series; everything else stays in `data`.
target = data[["total_cases", "new_cases"]]
data.drop(columns=["total_cases", "new_cases"], inplace=True)
display(data.head())
display(target.head())
| iso_code | continent | location | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | total_cases_per_million | new_cases_per_million | new_cases_smoothed_per_million | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 2020-03-03 | RUS | Europe | Russia | 0.143 | NaN | 0.0 | 0.0 | 0.021 | 0.007 | 0.001 | ... | 58.3 | NaN | 8.05 | 72.58 | 0.824 | 144713312.0 | NaN | NaN | NaN | NaN |
| 2020-03-04 | RUS | Europe | Russia | 0.143 | NaN | 0.0 | 0.0 | 0.021 | 0.000 | 0.001 | ... | 58.3 | NaN | 8.05 | 72.58 | 0.824 | 144713312.0 | NaN | NaN | NaN | NaN |
| 2020-03-05 | RUS | Europe | Russia | 0.143 | NaN | 0.0 | 0.0 | 0.021 | 0.000 | 0.001 | ... | 58.3 | NaN | 8.05 | 72.58 | 0.824 | 144713312.0 | NaN | NaN | NaN | NaN |
| 2020-03-06 | RUS | Europe | Russia | 0.286 | NaN | 0.0 | 0.0 | 0.028 | 0.007 | 0.002 | ... | 58.3 | NaN | 8.05 | 72.58 | 0.824 | 144713312.0 | NaN | NaN | NaN | NaN |
| 2020-03-07 | RUS | Europe | Russia | 0.286 | NaN | 0.0 | 0.0 | 0.028 | 0.000 | 0.002 | ... | 58.3 | NaN | 8.05 | 72.58 | 0.824 | 144713312.0 | NaN | NaN | NaN | NaN |
5 rows × 64 columns
| total_cases | new_cases | |
|---|---|---|
| date | ||
| 2020-03-03 | 3.0 | 1.0 |
| 2020-03-04 | 3.0 | 1.0 |
| 2020-03-05 | 3.0 | 1.0 |
| 2020-03-06 | 4.0 | 1.0 |
| 2020-03-07 | 4.0 | 1.0 |
# Plot the training span and sanity-check that summing new_cases
# roughly reproduces the last total_cases value.
train_slice = target[target.index < stop_train_date]
px.line(train_slice, markers=True)
(
    # .iloc[-1] is explicit positional indexing; `series[-1]` on a
    # DatetimeIndex is deprecated and removed in pandas 2.x.
    f"last total: {train_slice.total_cases.iloc[-1]}",
    # Series.sum() skips NaN, unlike the builtin sum().
    f"sum new: {train_slice.new_cases.sum()}",
)
('last total: 52763.0', 'sum new: 52772.0')
Очевидно, что new_cases можно пересчитать в total_cases (есть незначительные ошибки в данных). Но new_cases может убывать, а так как мы априори берём неубывающие функции, логично пользоваться именно total_cases.
# Take an explicit copy so adding the "x" column below does not trigger
# pandas' SettingWithCopyWarning (the one reported for this cell).
df = target[["new_cases", "total_cases"]].copy()
# x = day index since the start of the series; the regression feature.
df["x"] = np.arange(df.shape[0])
# The model treats x as consecutive days, so verify the series is daily
# with no gaps.
assert all(
    timedelta(days=1) == d for d in df.index[1:] - df.index[:-1]
)
/tmp/ipykernel_140/346176215.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Train on the first 50 days, evaluate up to 2020-09-01.
# Reuse stop_train_date (= start_date + 50 days, computed at load time)
# instead of re-deriving the same timestamp, and filter df by its own
# index rather than target's (identical index, clearer intent).
train = df[df.index < stop_train_date]
test = df[
    (df.index >= stop_train_date)
    & (df.index <= datetime(year=2020, month=9, day=1))
]
px.line(test.total_cases, markers=True)
Первая модель имеет вид $$ y=c_1e^{c_2x+c_3} $$ Логарифмируя обе части, получаем $$ \ln(y)=\ln(c_1)+c_2x+c_3 $$ то есть $$ \ln(y)=ax+b $$ Поэтому нужно прологарифмировать данные
# Model 1: fit ln(total_cases) = b + a*x with ordinary least squares.
x_train = train.x.to_numpy().reshape(-1, 1)
y_train = train.total_cases.to_numpy().flatten()
model = LinearRegression().fit(x_train, np.log(y_train))
f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x"
'ln(y)=1.19+0.21x'
# Back-transform the log-space predictions and overlay them on the data.
pred_train = np.exp(model.predict(x_train))
fig = go.Figure()
for series, label in ((pred_train, "predict"), (y_train, "real")):
    fig.add_scatter(x=train.index, y=series, name=label)
# Residual std in log space — used as the observation noise below.
log_residuals = np.log(y_train) - np.log(pred_train)
sigma_noise = np.std(log_residuals)
sigma_noise
0.4498899109116323
def bayesian_update(mu, sigma, x, y, sigma_noise):
    """One-step Bayesian linear-regression posterior update.

    Given a Gaussian prior N(mu, sigma) over the weight vector
    (bias first, then feature weights) and a single observation
    (x, y) with Gaussian noise variance sigma_noise**2, return the
    posterior mean and covariance (mu_n, sigma_n):

        sigma_n = (sigma^-1 + X^T X / sigma_noise^2)^-1
        mu_n    = sigma_n (sigma^-1 mu + X^T y / sigma_noise^2)

    x is a 1-D feature vector; y is a scalar target.
    """
    # Design row with an explicit bias term: X = [[1, x_1, ..., x_k]].
    x_matrix = np.array([np.hstack(([1], x))])
    precision = 1.0 / (sigma_noise**2)
    # Invert the prior covariance once (the original inverted it twice).
    sigma_inv = np.linalg.inv(sigma)
    sigma_n = np.linalg.inv(sigma_inv + precision * (x_matrix.T @ x_matrix))
    mu_n = sigma_n @ (sigma_inv @ mu + precision * (x_matrix.T @ np.array([y])))
    return mu_n, sigma_n
def bayesian_update_loop(x, y, sigma_noise):
    """Run sequential Bayesian updates over a whole training set.

    Starts from a zero-mean prior with covariance 2*I over the
    (bias + features) weight vector and folds in each observation,
    fitting in log space (np.log of each target value).
    """
    n_weights = x.shape[1] + 1  # +1 for the bias term
    mu = np.zeros(n_weights)
    sigma = 2.0 * np.eye(n_weights)
    for features, value in zip(x, y):
        mu, sigma = bayesian_update(mu, sigma, features, np.log(value), sigma_noise)
    return mu, sigma
# Posterior over the exponential model's two weights (bias, slope).
mu, sigma = bayesian_update_loop(x_train, y_train, sigma_noise)
mu, sigma
(array([1.18416525, 0.21438625]),
array([[ 1.55931945e-02, -4.72519863e-04],
[-4.72519863e-04, 1.93255611e-05]]))
samples_count = 50
def draw_samples(fig, data, mu, sigma, count=samples_count):
    """Sample `count` weight vectors from N(mu, sigma) and plot the
    exponential curve exp(a*x + b) implied by each draw.

    Each curve is also stored in `data` as a "sample_<i>" column so it
    can be aggregated later.
    """
    pd.options.mode.chained_assignment = None
    np.random.seed(0)  # fixed seed -> reproducible fan of curves
    weight_draws = np.random.multivariate_normal(mu, sigma, count)
    xs = data.x.to_numpy().flatten()
    for idx, (intercept, slope) in enumerate(weight_draws):
        column = f"sample_{idx}"
        data.loc[:, column] = np.exp(slope * xs + intercept)
        fig.add_scatter(
            y=data[column],
            showlegend=False,
            line_color="grey",
            opacity=0.1,
            x=data.index,
        )
# Posterior predictive fan vs. the OLS fit and the observed series.
fig = go.Figure()
draw_samples(fig, train, mu, sigma)
fig.add_scatter(x=train.index, y=pred_train, line_color="green", name="predict")
fig.add_scatter(x=train.index, y=y_train, name="real")
fig.update_layout(title="Данные на обучении")
# Extrapolate the sampled curves over the test horizon.
y_test = test.total_cases.to_numpy().flatten()
fig = go.Figure()
draw_samples(fig, test, mu, sigma)
fig.add_scatter(x=test.index, y=y_test, name="real")
fig.update_layout(title="Данные на тесте", yaxis_range=[0, 1e6])
# Posterior predictive distribution of total cases on 2020-05-01.
samples = test.loc["2020-5-1"].filter(regex="sample_*")
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 1047015'
# Same prediction one month further out — the exponential blows up.
samples = test.loc["2020-6-1"].filter(regex="sample_*")
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 838242656'
# By September the pure exponential model is off by many orders of magnitude.
samples = test.loc["2020-9-1"].filter(regex="sample_*")
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 388038639395832128'
Во второй модели необходимо учесть квадратичную зависимость, поэтому добавляем квадратичный признак. Наличие интеграла (суммирования) наводит на мысль обучать модель на данных new_cases, а затем получать из них total_cases суммированием.
# Model 2: ln(new_cases) = b0 + b1*x + b2*x^2 (quadratic in time).
days = train.x.to_numpy().reshape(-1, 1)
x_train = np.hstack((days, days**2))
y_train = train.new_cases.to_numpy().flatten()
model = LinearRegression().fit(x_train, np.log(y_train))
f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x+{model.coef_[1]:.2}x^2"
'ln(y)=-0.873+0.26x+-0.0012x^2'
# Back-transform and plot the quadratic model's fit on the training data.
pred_train = np.exp(model.predict(x_train))
fig = go.Figure()
for series, label in ((pred_train, "predict"), (y_train, "real")):
    fig.add_scatter(x=train.index, y=series, name=label)
sigma_noise = np.std(np.log(y_train) - np.log(pred_train))
# Sigma came out larger than for model 1 because the target here is the
# noisier new_cases series.
sigma_noise
1.1962232272133029
# Posterior over the quadratic model's three weights (bias, linear, quadratic).
mu, sigma = bayesian_update_loop(x_train, y_train, sigma_noise)
mu, sigma
(array([-0.77760687, 0.2505384 , -0.00105931]),
array([[ 2.12555789e-01, -1.71684387e-02, 2.88989973e-04],
[-1.71684387e-02, 1.95321068e-03, -3.76494139e-05],
[ 2.88989973e-04, -3.76494139e-05, 7.78336878e-07]]))
def draw_samples(fig, data, mu, sigma, count=samples_count):
    """Sample quadratic-model weights from N(mu, sigma) and plot, for
    each draw, the cumulative sum of exp(w0 + w1*x + w2*x^2) — i.e. the
    total_cases curve implied by the sampled new_cases model.

    Each curve is stored in `data` as a "sample_<i>" column.
    """
    pd.options.mode.chained_assignment = None
    np.random.seed(0)  # reproducible draws
    weight_draws = np.random.multivariate_normal(mu, sigma, count)
    xs = data.x.to_numpy().flatten()
    for idx, (w0, w1, w2) in enumerate(weight_draws):
        column = f"sample_{idx}"
        # cumsum turns predicted daily new cases into running totals
        data.loc[:, column] = np.cumsum(np.exp(w0 + w1 * xs + w2 * xs**2))
        fig.add_scatter(
            y=data[column],
            showlegend=False,
            line_color="grey",
            opacity=0.1,
            x=data.index,
        )
# Training fit: sampled cumulative curves vs. the cumulated point
# prediction and the real totals.
fig = go.Figure()
draw_samples(fig, train, mu, sigma)
fig.add_scatter(
    x=train.index, y=np.cumsum(pred_train), line_color="green", name="predict"
)
fig.add_scatter(x=train.index, y=train.total_cases.to_numpy().flatten(), name="real")
fig.update_layout(title="Данные на обучении")
# Test horizon: real totals plus 10/90 percentiles and mean of the
# sampled cumulative curves.
y_test = test.total_cases.to_numpy().flatten()
fig = go.Figure()
draw_samples(fig, test, mu, sigma)
fig.add_scatter(x=test.index, y=y_test, name="real")
# Select the sample_<i> columns once; draw_samples has already added them.
sample_columns = test.filter(regex="sample_*")
fig.add_scatter(
    x=test.index,
    y=np.percentile(sample_columns, 10, axis=1),
    name="10 percentile",
)
fig.add_scatter(
    x=test.index,
    y=np.percentile(sample_columns, 90, axis=1),
    name="90 percentile",
)
fig.add_scatter(x=test.index, y=np.mean(sample_columns, axis=1), name="mean")
fig.update_layout(title="Данные на тесте", yaxis_range=[0, 1e6])
Естественно получилось лучше, чем экспоненциальный рост
def process(country_name, target, train_days=50, ylim=(0, 2e6)):
    """Fit the quadratic log-linear model for one country and plot it.

    Trains on the first `train_days` days of `target` (a DataFrame with
    total_cases and new_cases columns indexed by date), runs the
    sequential Bayesian update, draws sampled cumulative curves up to
    2020-10-01 and displays the resulting figure with `ylim` as the
    y-axis range.

    Returns the OLS point estimate (intercept, linear coef, quadratic coef).
    """
    target = target.copy()  # never mutate the caller's frame
    target["x"] = np.arange(target.shape[0])
    train = target[target.index < target.index[0] + timedelta(days=train_days)]
    target = target[target.index <= datetime(year=2020, month=10, day=1)]
    # OLS fit of ln(new_cases) on [x, x^2] for the point estimate.
    model = LinearRegression()
    x_train = np.hstack(
        (train.x.to_numpy().reshape(-1, 1), train.x.to_numpy().reshape(-1, 1) ** 2)
    )
    y_train = train.new_cases.to_numpy().flatten()
    model = model.fit(x_train, np.log(y_train))
    display(f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x+{model.coef_[1]:.2}x^2")
    pred_train = np.exp(model.predict(x_train))
    sigma_noise = np.std(np.log(y_train) - np.log(pred_train))
    mu, sigma = bayesian_update_loop(x=x_train, y=y_train, sigma_noise=sigma_noise)
    y = target.total_cases.to_numpy().flatten()
    fig = go.Figure()
    draw_samples(fig, target, mu, sigma)
    fig.add_scatter(y=y, name="real", x=target.index)
    fig.add_scatter(
        y=np.percentile(target.filter(regex=("sample_*")), 10, axis=1),
        name="10 percentile",
        x=target.index,
    )
    fig.add_scatter(
        y=np.percentile(target.filter(regex=("sample_*")), 90, axis=1),
        name="90 percentile",
        x=target.index,
    )
    fig.add_scatter(
        y=np.mean(target.filter(regex=("sample_*")), axis=1),
        name="mean",
        x=target.index,
    )
    fig.update_layout(title=f"Данные по {country_name}")
    # BUG FIX: the original read the global `y_lim` from the calling loop
    # (NameError when called standalone); use the `ylim` parameter.
    fig.update_layout(yaxis_range=list(ylim))
    display(fig)
    return model.intercept_, model.coef_[0], model.coef_[1]
# Re-load the full dataset (all countries) and repeat the experiment,
# collecting each fitted weight vector.
data = pd.read_csv("owid-covid-data.csv")
# .loc avoids the chained-assignment pattern (SettingWithCopyWarning).
data.loc[data.new_cases == 0, "new_cases"] = 1
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
data.index = data["date"]
data.drop(columns=["date"], inplace=True)
points = []
for country_name, y_lim in zip(
    ("Russia", "Germany", "Italy", "China"), ([0, 2e6], [0, 2e6], [0, 2e6], [0, 2e5])
):
    target = data[["total_cases", "new_cases"]]
    target = target[data["location"] == country_name]
    # Start each country's series at its first day with more than 2 cases.
    target = target[target.index >= target[target.total_cases > 2].index[0]]
    # Pass the per-country y-limit explicitly instead of relying on the
    # loop variable being visible as a global inside process().
    w0, w1, w2 = process(country_name, target, ylim=y_lim)
    points.append({"name": country_name, "w0": w0, "w1": w1, "w2": w2})
'ln(y)=-0.873+0.26x+-0.0012x^2'
'ln(y)=0.77+-0.14x+0.0062x^2'
'ln(y)=-1.17+0.14x+0.0018x^2'
'ln(y)=-2.01+0.56x+-0.0079x^2'
# Collected quadratic-model weights (w0, w1, w2) per country.
pd.DataFrame(points)
| name | w0 | w1 | w2 | |
|---|---|---|---|---|
| 0 | Russia | -0.872558 | 0.258281 | -0.001190 |
| 1 | Germany | 0.770067 | -0.137111 | 0.006168 |
| 2 | Italy | -1.165233 | 0.135692 | 0.001794 |
| 3 | China | -2.014519 | 0.561703 | -0.007927 |
# Countries as points in the 3-D weight space (w0, w1, w2).
px.scatter_3d(pd.DataFrame(points), x="w0", y="w1", z="w2", color="name")
Сильно отличаются Китай и Германия. Россия подобна Италии